#Script runs 1000 simulations of Wordfish models on topic 5
#Required input is the STM model (modelctt1_88.rds)
#and the corresponding DFM (dfm88.Rdata)
#Separate output files for positions (out_pos)
#and term discrimination parameters (out_feat). 

# Clear environment and run garbage collection
rm(list=ls())
gc()

# Load necessary libraries
library(data.table)
library(dplyr)
library(ggplot2)
library(magrittr)
library(RSQLite)
library(quanteda)
library(shiny)
library(shinyBS)
library(SnowballC)
library(stminsights)
library(stringr)
library(tidytext)
library(tidyr)
library(wordcloud)
library(quanteda.textplots)
library(quanteda.textmodels)
library(matrixStats)
library(foreign)
library(readxl)
library(stm)
library(openxlsx)


# Load model and data
model <- readRDS("modelctt1_88.rds")
load("dfmK88.Rdata")

# Calculate scores for exclusivity scores
FREQSCORE <- apply(model$beta$logbeta[[1]], 1, data.table::frank)/
  ncol(model$beta$logbeta[[1]])
EXCLSCORE <- apply(t(t(model$beta$logbeta[[1]]) -
                       matrixStats::colLogSumExps((model$beta$logbeta[[1]]))),
                   1, data.table::frank) / ncol(model$beta$logbeta[[1]])
FREXSCORE <- 1 / (0.5 / FREQSCORE + (1 - 0.5) / EXCLSCORE)
rownames(FREXSCORE) <- rownames(FREQSCORE) <- rownames(EXCLSCORE)
measurement <- EXCLSCORE

# Initialize result data frames
thetaout <- data.frame(ID = dfm@docvars$ID)
thetaresult <- data.frame(ccode = dfm@docvars$ccode,
                          year = dfm@docvars$year,
                          ID = dfm@docvars$ID)

lo95out <- data.frame(ID=dfm@docvars$ID)
lo95result <- data.frame(ccode = dfm@docvars$ccode,
                         year = dfm@docvars$year,
                         ID = dfm@docvars$ID)

hi95out <- data.frame(ID = dfm@docvars$ID)
hi95result <- data.frame(ccode = dfm@docvars$ccode,
                         year = dfm@docvars$year,
                         ID = dfm@docvars$ID)

alphaout <- data.frame(ID = dfm@docvars$ID)
alpharesult <- data.frame(ccode = dfm@docvars$ccode,
                          year = dfm@docvars$year,
                          ID = dfm@docvars$ID)

seout <- data.frame(ID = dfm@docvars$ID)
seresult <- data.frame(ccode = dfm@docvars$ccode,
                       year = dfm@docvars$year,
                       ID = dfm@docvars$ID)

betaout <- data.frame(features2 = dfm@Dimnames$features)
betaresult <- data.frame(features = dfm@Dimnames$features)

psiout <- data.frame(features2 = dfm@Dimnames$features)
psiresult <- data.frame(features = dfm@Dimnames$features)

feat <- data.frame(features = dfm@Dimnames$features)

dfmat <- dfm
rm(dfm)
gc()
dfmat <- dfm_group(dfmat, groups = c(ID))

# Main loop 
i <- 5
set.seed(123)
iterations <- 1000

for(j in 1:iterations){
  cat("\n Iteration", j)
  clock <- Sys.time()
  
  measurement2 <- data.frame(measurement)
  sampled <- numeric(length(rownames(measurement2)))
  for (k in seq_along(rownames(measurement2))) sampled[[k]] <-
    rbinom(1, 1, prob = measurement2[k, i])
  ftd <- data.frame(feat, sampled)
  ftd <- subset(ftd, sampled == 0, select = c(features))
  dict <- dictionary(list(countries = ftd$features))
  
  sampled_dfm <- dfm_select(dfmat, pattern = dict, selection = "remove")

  tmod_wf <- textmodel_wordfish(sampled_dfm, dir = c(6, 500))
  
  if(tmod_wf$theta[1] == "NaN"){
    j <- j - 1}
  else {
    thetao <- data.frame(theta = tmod_wf$theta, ID = tmod_wf$x@docvars$ID)
    thetas <- left_join(thetaout, thetao)
    names(thetas)[2] <- paste0("theta", j)
    thetas <- subset(thetas, select = -c(ID))
    if(j == 1){
      ref_theta <- thetas
    }
    dot <- data.frame(ref_theta$theta1, thetas)
    tess = cor(na.omit(dot))
    if(tess[1, 2] < 0){
      thetas <- -1 * thetas
    }
    thetaresult <- data.frame(thetaresult, thetas)
    
    alphao <- data.frame(alpha = tmod_wf$alpha, ID = tmod_wf$x@docvars$ID)
    alphas <- left_join(alphaout, alphao)
    names(alphas)[2] <- paste0("alpha", j)
    alphas <- subset(alphas, select = -c(ID))
    if(tess[1, 2] < 0){
      alphas <- -1 * alphas
    }
    alpharesult <- data.frame(alpharesult, alphas)
    
    seo <- data.frame(se = tmod_wf$se.theta, ID = tmod_wf$x@docvars$ID)
    ses <- left_join(seout, seo)
    names(ses)[2] <- paste0("se", j)
    ses <- subset(ses, select = -c(ID))
    if(tess[1, 2] < 0){
      ses <- -1 * ses
    }
    seresult <- data.frame(seresult, ses)
    
    lo95o <- thetao$theta - seo$se * 1.95
    lo95o <- data.frame(lo = lo95o, ID = tmod_wf$x@docvars$ID)
    lo95s <- left_join(lo95out, lo95o)
    names(lo95s)[2] <- paste0("lo95", j)
    lo95s <- subset(lo95s, select = -c(ID))
    if(tess[1, 2] < 0){
      lo95s <- -1 * lo95s
    }
    lo95result <- data.frame(lo95result, lo95s)
    
    hi95o <- thetao$theta + seo$se * 1.95
    hi95o <- data.frame(hi = hi95o, ID = tmod_wf$x@docvars$ID)
    hi95s <- left_join(hi95out, hi95o)
    names(hi95s)[2] <- paste0("hi95", j)
    hi95s <- subset(hi95s, select = -c(ID))
    if(tess[1, 2] < 0){
      hi95s <- -1 * hi95s
    }
    hi95result <- data.frame(hi95result, hi95s)
    
    betao <- data.frame(beta = tmod_wf$beta, features2 = tmod_wf$features)
    betas <- left_join(betaout, betao)
    names(betas)[2] <- paste0("beta", j)
    betas <- subset(betas, select = -c(features2))
    if(tess[1, 2] < 0){
      betas <- -1 * betas
    }
    betaresult <- data.frame(betaresult, betas)
    
    psio <- data.frame(psi = tmod_wf$psi, features2 = tmod_wf$features)
    psis <- left_join(psiout, psio)
    names(psis)[2] <- paste0("psi", j)
    psis <- subset(psis, select = -c(features2))
    if(tess[1, 2] < 0){
      psis <- -1 * psis
    }
    psiresult <- data.frame(psiresult, psis)
  }}


#Summaries simulations and write to Stata file
thetaresult$thetSD <- apply(subset(thetaresult,
                                   select = -c(year, ccode, ID)),
                            1, sd, na.rm = TRUE)
thetaresult$thetM <- apply(subset(thetaresult,
                                  select = -c(year, ccode, ID, thetSD)),
                           1, mean, na.rm = TRUE)
alpharesult$alphM <- apply(subset(alpharesult,
                                  select = -c(year, ccode, ID)),
                           1, mean, na.rm = TRUE)
hi95result$hiM <- apply(subset(hi95result,
                               select = -c(year, ccode, ID)),
                        1, mean, na.rm = TRUE)
lo95result$loM <- apply(subset(lo95result,
                               select = -c(year, ccode, ID)),
                        1, mean, na.rm = TRUE)

out_pos <- data.frame(ccode = thetaresult$ccode, year = thetaresult$year,
                      thet_top5 = thetaresult$thetM,
                      SD_top5 = thetaresult$thetSD,
                      lo_top5 = lo95result$loM,
                      hi_top5 = hi95result$hiM,
                      alph_top5 = alpharesult$alphM)
write.dta(out_pos, "pos_top5_words.dta")

betaresult$betSD <- apply(subset(betaresult, select = -c(features)), 1,
                          sd, na.rm = TRUE)
betaresult$betM <- apply(subset(betaresult, select = -c(features, betSD)), 1,
                         mean, na.rm = TRUE)
psiresult$psiM <- apply(subset(psiresult, select = -c(features)), 1,
                        mean, na.rm = TRUE)
out_feat <- data.frame(features = betaresult$features,
                       bet_top5 = betaresult$betM,
                       psi_top5 = psiresult$psiM,
                       betSD_top5 = betaresult$betSD)
write.dta(out_feat, "feat_top5_words.dta")
